import re
import csv
#from ListofFiles1 import files
from ListofFiles2 import files

## Output CSV: one row per <RULE> element found in the input XML files.
#filename = "Rules00to06.csv" #Does not include June 25, 2003
filename = "Rules07to15.csv"

headers = ["Date", "Agency", "SubAgency", "CFR", "SignatureDate", "SignatoryName",
           "SignatoryTitle", "RIN", "FRDOC", "EffectiveDate", "ImportantDates",
           "Addresses", "SubjectList", "AUTH", "Agency2", "Action", "Summary",
           "FirstPage", "LastPage", "RuleTitle"]

## Known limitations (carried over from the original notes):
## - BILCOD, NAME and TITLE can occur several times in one rule; all matching
##   lines are simply joined into a single cell.
## - Cell values still contain their XML tags. An earlier draft stripped them
##   with the pattern '>(.*)<', but that step is disabled.

# Tags whose whole value sits on the same line as the tag itself.
SINGLE_LINE_TAGS = ["<AGENCY", "<SUBAGY>", "<CFR", "<DATED>", "<NAME>", "<TITLE>", "<RIN>"]

# (opening pattern, closing pattern) pairs whose value may span several lines.
MULTI_LINE_TAGS = [("<FRDOC>", "</FRDOC>"), ("<EFFDATE", "</EFFDATE"),
                   ("<DATES", "</DATES"), ("<ADD", "</ADD")]

# (opening pattern, closing pattern) pairs where only the line just before the
# closing tag is wanted.
LAST_LINE_TAGS = [("<AGY>", "</AGY"), ("<ACT>", "</ACT>"), ("<SUM>", "</SUM>")]

# A run of two or more whitespace characters (indentation, or a line break
# followed by indentation).
_EXTRA_SPACE = re.compile(r"\s{2,}")


def _squeeze(text):
    """Return text with every run of two or more whitespace characters removed."""
    return _EXTRA_SPACE.sub("", text)


def _tagged_lines(rule, pattern):
    """Join every line of rule that matches pattern; "None" if no line matches."""
    hits = [line for line in rule if re.search(pattern, line)]
    if not hits:
        return "None"
    return _squeeze("".join(hits))


def _lines_after(rule, pattern, count):
    """Join the count lines that follow each line matching pattern.

    Returns "None" when no line matches pattern at all.
    """
    found = False
    hits = []
    for index, line in enumerate(rule):
        if re.search(pattern, line):
            found = True
            hits.extend(rule[index + 1:index + 1 + count])
    if not found:
        return "None"
    return _squeeze("".join(hits))


def _tag_spans(rule, opener, closer):
    """Return (start, end) line-index pairs for each opener...closer span.

    The most recent opener before a closer wins.  A closer with no opener
    before it is ignored, so a stale index is never reused.
    """
    spans = []
    start = None
    for index, line in enumerate(rule):
        if re.search(opener, line):
            start = index
        if start is not None and re.search(closer, line):
            spans.append((start, index))
            start = None
    return spans


def _span_pieces(rule, opener, closer):
    """Return the whitespace-split pieces of every opener...closer span.

    Always yields exactly one cell value: a flat list of the non-empty pieces
    (several spans are concatenated), or "None" when there is no complete
    span.  The original appended one cell per closing tag and none at all for
    an unclosed tag, which shifted every later column of the row.
    """
    spans = _tag_spans(rule, opener, closer)
    if not spans:
        return "None"
    pieces = []
    for start, end in spans:
        for line in rule[start:end + 1]:
            pieces.extend(part for part in _EXTRA_SPACE.split(line) if part)
    return pieces


def _last_line_before(rule, opener, closer):
    """Return the content of the line just before each closing tag.

    Always yields exactly one cell value ("None" when there is no complete
    span; several spans are joined) so the row stays aligned with the header.
    """
    values = []
    for start, end in _tag_spans(rule, opener, closer):
        # When opener and closer share a line there is no "line before";
        # fall back to that line instead of raising IndexError.
        line = rule[end - 1] if end > start else rule[end]
        parts = _EXTRA_SPACE.split(line)
        # parts[0] is normally the empty string left of the indentation; a
        # line without indentation has only one part.
        values.append(parts[1] if len(parts) > 1 else parts[0])
    if not values:
        return "None"
    return "".join(values)


def _page_range(rule):
    """Return (first, last) PRTPAGE numbers in rule, or ("None", "None")."""
    pages = []
    for line in rule:
        pages.extend(re.findall('PRTPAGE P="([0-9]+)', line))
    if not pages:
        return "None", "None"
    return pages[0], pages[-1]


def _first_subject(rule):
    """Return the first line containing <SUBJECT>, squeezed; "None" if absent."""
    for line in rule:
        if re.search("<SUBJECT>", line):
            return _squeeze(line)
    return "None"


def _build_row(source_name, rule):
    """Build one CSV row, in header order, from the lines of a single rule.

    source_name fills the first ("Date") column; rule is the list of lines
    from the <RULE> line up to, but not including, the </RULE> line.
    """
    row = [source_name]
    row.extend(_tagged_lines(rule, tag) for tag in SINGLE_LINE_TAGS)
    row.extend(_span_pieces(rule, opener, closer) for opener, closer in MULTI_LINE_TAGS)
    row.append(_lines_after(rule, "<LSTSUB", 3))
    row.append(_lines_after(rule, '<HD SOURCE="HED">Authority:</HD>\n', 1))
    row.extend(_last_line_before(rule, opener, closer) for opener, closer in LAST_LINE_TAGS)
    row.extend(_page_range(rule))
    row.append(_first_subject(rule))
    return row


# NOTE(review): binary mode is what the Python 2 csv module expects; under
# Python 3 this would need text mode with newline="" -- confirm the target
# interpreter before changing it.
with open(filename, "w+b") as f:
    writer = csv.writer(f)
    writer.writerow(headers)

    #files=["FR-2000-02-29.xml"]
    for path in files:
        # Close each input file as soon as it has been read.
        with open(path) as source:
            lines = source.readlines()

        ## Find where the rules begin and end
        rule_starts = [i for i, line in enumerate(lines) if re.search("<RULE>", line)]
        rule_ends = [i for i, line in enumerate(lines) if re.search("</RULE>", line)]

        # zip() pairs the n-th opening tag with the n-th closing tag and
        # stops at the shorter list, so a trailing unclosed <RULE> is skipped
        # instead of raising IndexError.
        for first, last in zip(rule_starts, rule_ends):
            writer.writerow(_build_row(path, lines[first:last]))